MonolayerCultures / src / AllCells / [RC17]QC.Rmd
[RC17]QC.Rmd
Raw
---
title: "[RC17] QC"
author: "Nina-Lydia Kazakou"
date: "03/03/2022"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Set-up
### Load libraries
```{r message=FALSE, warning=FALSE}
library(SingleCellExperiment)
library(Seurat)
library(tidyverse)
library(Matrix)
library(scales)
library(cowplot)
library(RCurl)
library(scater)
library(Matrix)
library(ggplot2)
library(edgeR)
library(dplyr)
library(ggsci)  
library(here)
```

### Colour Palette 
```{r load_palette}
mypal <- pal_npg("nrc", alpha = 0.7)(10)
mypal2 <-pal_tron("legacy", alpha = 0.7)(7)
mypal3<-pal_lancet("lanonc", alpha = 0.7)(9)
mypal4<-pal_simpsons(palette = c("springfield"), alpha = 0.7)(16)
mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6)
mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5)
mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5)
mycoloursP<- c(mypal, mypal2, mypal3, mypal4)
```

# Create srt
```{r message=FALSE, warning=FALSE}
data_wk1 <- Read10X(here("data", "CellRanger", "15451WApool01A__hOL_wk1", "filtered_feature_bc_matrix"))
data_wk2 <- Read10X(here("data", "CellRanger", "15451WApool01A__hOL_wk2", "filtered_feature_bc_matrix"))
data_wk3 <- Read10X(here("data", "CellRanger", "15451WApool01A__hOL_wk3", "filtered_feature_bc_matrix"))
```

```{r message=FALSE, warning=FALSE}
wk1 <- CreateSeuratObject(counts = data_wk1, project = "RC17_wk1")
wk2 <- CreateSeuratObject(counts = data_wk2, project = "RC17_wk2")
wk3 <- CreateSeuratObject(counts = data_wk3, project = "RC17_wk3")
```

```{r message=FALSE, warning=FALSE}
merge_object <- merge(x = wk1, y = c(wk2, wk3), add.cell.id = c("wk1", "wk2", "wk3"), project = "RC17")
head(merge_object@meta.data, 2)
```

### Populate meta.data
```{r warning=FALSE}
# Add number of genes per UMI for each cell to metadata.filt
merge_object$log10GenesPerUMI <- log10(merge_object$nFeature_RNA) / log10(merge_object$nCount_RNA)

merge_object[["percent.mt"]] <- PercentageFeatureSet(merge_object, pattern = "^MT-")

chem <- rep("v3.1", ncol(merge_object))
merge_object <- AddMetaData(merge_object, chem, col.name = "Chemistry")
merge_object[["percent.ribo"]] <- PercentageFeatureSet(merge_object, pattern = "^RP[SL]")

metadata.filt <- merge_object@meta.data

metadata.filt$Cells <- rownames(metadata.filt)

metadata.filt$Sample <- NA
t1 <- grepl("^wk1_", metadata.filt$Cells)
t2 <- grepl("^wk2_", metadata.filt$Cells)
t3 <- grepl("^wk3_", metadata.filt$Cells)

metadata.filt$Sample[which(t1)] <- "wk1"
metadata.filt$Sample[which(t2)] <- "wk2"
metadata.filt$Sample[which(t3)] <- "wk3"

# Add metadata.filt back to Seurat object
merge_object@meta.data <- metadata.filt
head(merge_object@meta.data, 2)
```

```{r message=FALSE, warning=FALSE}
dir.create(here("data", "Processed"))
dir.create(here("data", "Processed", "AllCells"))
saveRDS(merge_object, here("data", "Processed", "AllCells", "build_srt.rds"))
```

### Generate sce object
```{r message=FALSE, warning=FALSE}
my_sce <- as.SingleCellExperiment(merge_object)
my_sce
nrow(my_sce)
```

# QC
## Filter Genes 
```{r Gene_Filtering}
keep_feature <- rowSums(counts(my_sce) > 0) >= 10
my_sce <- my_sce[keep_feature, ]
nrow(my_sce)
```


```{r message=FALSE, warning=FALSE}
df1 <- perFeatureQCMetrics(my_sce)
df1 # mean: numeric, the mean counts for each feature, detected: numeric, the percentage of observations above threshold
my_sce <- addPerFeatureQC(my_sce)
colnames(rowData(my_sce))

dir.create(here("outs", "Processed"))
dir.create(here("outs", "Processed", "AllCells"))
write.csv(df1, here("outs", "Processed", "AllCells", "FeautesQC.csv"))
```


## Filter Cells
I will use the scater package to add some quality information per cell. This computes for each cell some useful metrics such as the number of umi counts (library size), the number of detected genes and the percentage of mitochondiral genes.
```{r Cell_Filtering}
my_sce <- addPerCellQC(my_sce, subsets=list(Mito=grep("MT-", rownames(my_sce))))

df <- perCellQCMetrics(my_sce, subsets=list(Mito=grep("MT-", rownames(my_sce))))
head(df, 2)
```
I will first plot the distibution of cells based on Sample ID, i.e, timepoint to see they follow a "typical" distribution:
```{r}
plotColData(my_sce, x = "sum", y="detected", colour_by="Sample") + ggtitle("Total Cells")
plotColData(my_sce, x = "Sample", y="sum") + ggtitle("Total Count")
plotColData(my_sce, x = "Sample", y="detected") + ggtitle("Detected Features")
```

I will use the automatic isOutlier function to determine which values in a numeric vector are outliers based on the median absolute deviation (MAD). Any cell with >3MADs is marked as an outlier. When using this function with low number a log transformation is added, that prevents negative thresholds.
I will also check if the applied automatic thresholds are right for my cells; if not, I will set them manually, based on the graphs above. 
```{r UMI_Low}
qc.lib.lower <- isOutlier(df$sum, log=TRUE, type="lower")
lowUMI<-as.numeric(attr(qc.lib.lower, "thresholds")[2])
if(lowUMI == Inf){
lowUMI <-0
}
if(min(df$sum)>= lowUMI){
print("Filter for low UMI counts are too permissive. Apply manual set threshold. Default is 100")
qc.lib.lower<- df$sum<20
lowUMI<-20
}else{
print("Calculated thresholds seem to filter for low UMI counts")
}
```
Set a manual threshold:
```{r}
qc.lib.low <- df$sum < 100
```

```{r UMI_High}
qc.lib.higher <-isOutlier(df$sum, log=TRUE, type="higher")
higherUMI<-as.numeric(attr(qc.lib.higher, "thresholds")[2])
if(max(df$sum)<= higherUMI){
  print("Filter for high UMI counts too permissive. Apply manual set threshold.")
  qc.lib.higher<- df$sum > 30000
  higherUMI<-30000
}else{
  print("Calculated thresholds seem to filter for high UMI counts")
}
```

```{r}
attr(qc.lib.higher, "thresholds")
```

```{r Gene_Low}
qc.nexprs.lower <- isOutlier(df$detected, log=TRUE, type="lower")
lowerFeatures<-as.numeric(attr(qc.nexprs.lower, "thresholds")[2])
if(min(df$detected)>= lowerFeatures | lowerFeatures < 10){
  print("Filter for low gene counts too permissive. Apply manual set threshold.")
  qc.nexprs.lower<-(df$detected)<10
  lowerFeatures<-10
}else{
  print("Calculated thresholds seem to filter for low gene counts")
}
```

```{r}
attr(qc.nexprs.lower, "thresholds")
```

```{r Gene_High}
qc.nexprs.higher <- isOutlier(df$detected, log=TRUE, type="higher")
higherFeatures<-as.numeric(attr(qc.nexprs.higher, "thresholds")[2])
if(max(df$detected)<= higherFeatures){
  print("Filter for high gene counts too permissive. Apply manual set threshold.")
  qc.nexprs.higher<-(df$detected)> 8000
  higherFeatures<-8000
}else{
  print("Calculated thresholds seem to filter for high gene counts")
}
```
Set manual threshold:
```{r}
qc.nexprs.high <- df$detected > 7000
```

```{r Mito}
qc.mito <- isOutlier(df$subsets_Mito_percent, type="higher")
attr(qc.mito, "thresholds")
percMitoThres<- as.numeric(attr(qc.mito, "thresholds"))
```
Create a vector containing all the outliers:
```{r}
outlier <- qc.lib.low | qc.lib.higher | qc.nexprs.lower | qc.nexprs.high | qc.mito
```

Save thresholds into a df:
```{r}
summary_df <- data.frame(
  lib_size_low = sum(qc.lib.low), lib_size_low_threshold = lowUMI,  
  
  lib_size_high = sum(qc.lib.higher), lib_size_high_threshold = higherUMI, 

  expression_low = sum(qc.nexprs.lower), expression_low_threshold = lowerFeatures,
  
  expression_high = sum(qc.nexprs.high), expression_high_threshold = higherFeatures,
  
  mt_pct = sum(qc.mito), my_pct_threshold = percMitoThres,
  total = sum(outlier))

row.names(summary_df) <- c("Cells filtered", "Threshold")

write.csv(summary_df, here("outs", "Processed", "AllCells", "CellQC.csv"))

my_sce$outlier <- outlier
```

Plot the sample distribution; highlight the outliers:
```{r}
plotColData(my_sce, x = "sum", y = "detected", colour_by = "Sample") + ggtitle("Total Cells") 
plotColData(my_sce, x = "Sample", y = "sum", colour_by = "outlier") + ggtitle("Total Count")
plotColData(my_sce, x = "Sample", y = "detected", colour_by = "outlier") + ggtitle("Detected Features")
plotColData(my_sce, x="Sample", y = "subsets_Mito_percent", colour_by = "outlier") + ggtitle("Mito percent")
plotColData(my_sce, x="sum", y = "subsets_Mito_percent", colour_by = "outlier", other_fields = "Sample") + facet_wrap(~Sample)
```

```{r}
lost <- calculateAverage(counts(my_sce)[,!outlier])
kept <- calculateAverage(counts(my_sce)[,outlier])
logged <- cpm(cbind(lost, kept), log=TRUE, prior.count=2)
logFC <- logged[,1] - logged[,2]
abundance <- rowMeans(logged)
plot(abundance, logFC, xlab="Average count", ylab="Log-FC (lost/kept)", pch=16)
points(abundance[Mito=grep("MT-", rownames(my_sce))], logFC[Mito=grep("MT-", rownames(my_sce))], col="dodgerblue", pch=16)
```


I can now exclude the outliers & filter my object:
```{r}
filter_sce <- my_sce[,!outlier] 
dim(filter_sce)
```



```{r}
saveRDS(filter_sce, here("data", "Processed", "AllCells", "RC17_scaterQC_sce.rds"))  
```


```{r}
sessionInfo()
```